Text classification assigns tags or categories to text according to its content. In this analysis, the data set is a collection of 50,000 movie reviews from IMDB. I took the processed data from https://www.kaggle.com/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews/data; the original data is available at http://ai.stanford.edu/~amaas/data/sentiment/. The purpose of this analysis is to explore naive Bayes classification on text data.
# Read The data
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
# Load the IMDB reviews CSV into a DataFrame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running anywhere else.
data=pd.read_csv('C:/Users/mxm5116/Desktop/Data Mining/IMDB Dataset.csv')
data.head()
# Check the shape of the data
print(data.shape)
# Summary of the data set
data.describe()
# Count how many reviews carry each sentiment label
data['sentiment'].value_counts()
# Import library
from bs4 import BeautifulSoup
import re,string,unicodedata
# Removing the html strips
def strip_html(text):
    """Return the text content of *text* with HTML markup parsed away.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and
    only the extracted text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
# Remove the square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* with each bracketed span replaced by the empty string.
    """
    # Raw string: in a plain literal '\[' and '\]' are invalid escape
    # sequences, which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)
# Remove the noisy text
def denoise_text(text):
    """Strip HTML markup from *text*, then drop its square-bracketed spans."""
    return remove_between_square_brackets(strip_html(text))
# Run denoise_text over every entry of the 'review' column and store the
# result back in the same column.
data['review']=data['review'].apply(denoise_text)
# Now remove special characters and apply the function to the review column
def remove_special_characters(text, remove_digits=True):
    """Remove every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility only; it is not
            consulted and digits are always kept.

    Returns:
        The cleaned string.
    """
    # 'A-Z', not 'A-z': the latter range also spans '[', '\', ']', '^', '_'
    # and '`', which would then survive the clean-up.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
# Run remove_special_characters over every entry of the 'review' column and
# store the result back in the same column.
data['review']=data['review'].apply(remove_special_characters)
# Stemming the text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
def simple_stemmer(text):
    """Return *text* with each whitespace-separated word Porter-stemmed.

    The stemmed words are rejoined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_words = [stemmer.stem(token) for token in text.split()]
    return ' '.join(stemmed_words)
# Replace every review with its stemmed form, then preview the first rows.
data['review']=data['review'].apply(simple_stemmer)
data.head()
# Convert positive=1 and negative=0 as numeric
def posneg(x):
    """Map a sentiment label to a number.

    Returns 1 for ``"positive"``, 0 for ``"negative"``, and any other
    value unchanged.
    """
    if x == "positive":
        return 1
    return 0 if x == "negative" else x
# Map the 'sentiment' labels through posneg and store the result in a new
# 'score' column, then preview the first rows.
filtered_score = data["sentiment"].map(posneg)
data["score"] = filtered_score
data.head()
# Data Preparation for the model
from sklearn.model_selection import KFold, cross_val_score, train_test_split
import random
# Features are the cleaned review texts; targets are the values of the
# 'sentiment' column (not the numeric 'score' column).
X = data['review'].values
y = data['sentiment'].values
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# Concatenate all reviews of each split into a single '.'-separated string.
# NOTE(review): neither string is referenced again in the visible code.
train_voca='.'.join(X_train)
test_voca='.'.join(X_test)
import nltk
from sklearn.feature_extraction.text import CountVectorizer
# Fetch the 'punkt' resource through nltk's downloader.
nltk.download('punkt')
# Count vectorizer over the training reviews, tokenised with
# nltk.word_tokenize; min_df=1 is passed explicitly.
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)
train_counts = foovec.fit_transform(X_train)
print(train_counts)
train_counts.shape
# Vocabulary learned by the vectorizer (per scikit-learn's documentation, a
# mapping from term to feature/column index).
foovec.vocabulary_
from os import listdir
from collections import Counter
# Print the size of the vocabulary
print(len(foovec.vocabulary_))
# Rare words may be omitted, e.g. those occurring fewer than five times:
# keep only tokens with a minimum number of occurrences.
min_occurane = 5
# vocabulary_ maps token -> column index, not token -> count, so the
# occurrence totals have to come from the column sums of the count matrix.
token_totals = np.asarray(train_counts.sum(axis=0)).ravel()
tokens = [k for k, col in foovec.vocabulary_.items() if token_totals[col] >= min_occurane]
print(tokens[1:1000])
print(len(tokens))
Probability of occurrence • P[“the”] = number of documents containing “the” / number of all documents. Conditional probability based on the sentiment:
# P["the"] = number of training documents containing the token "the"
#            / number of training documents
words = ["the"]
sentences = X_train
count = 0
for sentence in sentences:
    # Compare against whole tokens: a substring test would also count
    # documents that merely contain "there", "other", "them", ...
    tokens_in_doc = set(sentence.split())
    for word in words:
        if word in tokens_in_doc:
            count = count + 1
num_of_documents_containing_the = count
print(num_of_documents_containing_the)
# Derive the denominator from the data instead of hard-coding 40000, so the
# ratio stays right if the split size changes.
num_of_all_documents = len(sentences)
print(num_of_all_documents)
Probability_of_the = num_of_documents_containing_the / num_of_all_documents
print(Probability_of_the)
# Take the positive-sentiment reviews from the training set.
# The split is random, so the training rows are rebuilt from X_train/y_train
# rather than sliced off the front of `data`.
train_data = pd.DataFrame({'review': X_train, 'sentiment': y_train})
# 'sentiment' holds string labels; comparing it with 0 would keep every row.
positive_docs = train_data.loc[train_data['sentiment'] == 'positive']
positive_docs.head()
# Make the list of positive reviews
train_pos_reviews = positive_docs['review']
pos_review_list = train_pos_reviews.values.tolist()
pos_review_list[1:5]
# Join the positive reviews with a single dot
train_pos_voca = '.'.join(pos_review_list)
# Now count the positive documents containing the token "the".
words = ["the"]
# Iterate over the documents: looping over the joined string would walk it
# character by character and never match a whole word.
sentences = pos_review_list
count = 0
for sentence in sentences:
    # Whole-token match, so "there" or "other" do not count as "the".
    tokens_in_doc = set(sentence.split())
    for word in words:
        if word in tokens_in_doc:
            count = count + 1
num_of_pos_documents_containing_the = count
print(num_of_pos_documents_containing_the)
# Find the total number of positive documents in the training data set
num_of_all_pos_documents = positive_docs['review'].count()
print(num_of_all_pos_documents)
# P["the" | Positive] = number of positive documents containing "the"
#                       / number of all positive review documents
probability_0f_the_in_positive_docs = num_of_pos_documents_containing_the / num_of_all_pos_documents
print(probability_0f_the_in_positive_docs)
# Prepare the inputs for five-fold cross-validation.
# Convert the text to TF-IDF vectors with ngram_range=(1,2); the vectorizer
# is fitted on the training reviews and then applied to the test reviews.
tf_idf_vect = TfidfVectorizer(ngram_range=(1,2))
tf_idf_train = tf_idf_vect.fit_transform(X_train)
tf_idf_test = tf_idf_vect.transform(X_test)
# Candidate alpha values: the integers 0 through 9.
alpha_range = list(np.arange(0,10,1))
len(alpha_range)
# Score each candidate alpha with 5-fold cross-validation on the training
# vectors and record the mean accuracy per alpha.
from sklearn.naive_bayes import MultinomialNB
alpha_scores = []
for a in alpha_range:
    clf = MultinomialNB(alpha=a)
    scores = cross_val_score(
        clf, tf_idf_train, y_train, cv=5, scoring='accuracy'
    )
    mean_accuracy = scores.mean()
    alpha_scores.append(mean_accuracy)
    print(a, mean_accuracy)
# Plot the misclassification error (1 - mean CV accuracy) against alpha.
import matplotlib.pyplot as plt
# NOTE: despite its name, MSE holds the misclassification error per alpha,
# not a mean squared error.
MSE = [1 - x for x in alpha_scores]
# Alpha with the smallest error; on ties, index() picks the first one.
optimal_alpha_bnb = alpha_range[MSE.index(min(MSE))]
# plot misclassification error vs alpha
plt.plot(alpha_range, MSE)
plt.xlabel('hyperparameter alpha')
plt.ylabel('Misclassification Error')
plt.show()
optimal_alpha_bnb
# Author's note: for alpha = 1 the minimum misclassification error was obtained.
# NOTE(review): that value comes from a past run; confirm it against
# optimal_alpha_bnb above.
Compare the effect of smoothing. Derive the top 10 words that predict the positive and negative classes • P[Positive | word]
# Effects of non-smoothing and smoothing
# Now lets see the highest positive and negative words that has highest sentiment prediction capacity
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the 'stopwords' resource through nltk's downloader.
nltk.download('stopwords')
# Stop words carry little meaning, so they are filtered out below while the
# positive and negative words are collected.
stop = set(stopwords.words('english'))
# English Snowball stemmer used when collecting the words.
sno = nltk.stem.SnowballStemmer('english')
def cleanhtml(sentence):
    """Replace every ``<...>`` tag in *sentence* with a single space."""
    return re.sub('<.*?>', ' ', sentence)
def cleanpunc(sentence):
    """Strip or blank out punctuation in *sentence*.

    First pass deletes question marks, exclamation marks, single and double
    quotes, hashes and pipes.  Second pass replaces full stops, commas,
    parentheses, pipes and forward slashes with a space.
    """
    without_marks = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    return re.sub(r'[.|,|)|(|\|/]', r' ', without_marks)
# Tokenise every review, drop stop words and short or non-alphabetic tokens,
# Snowball-stem what is left, and collect the stems per sentiment class.
final_string = []
all_positive_words = []
all_negative_words = []
# Hoisted out of the loop: the score array does not change while iterating,
# so there is no need to look it up again for every single word.
review_scores = data['score'].values
for sent, score in zip(data['review'].values, review_scores):
    filtered_sentence = []
    sent = cleanhtml(sent)
    for w in sent.split():
        for cleaned_words in cleanpunc(w).split():
            # Keep only alphabetic tokens longer than two characters.
            if not (cleaned_words.isalpha() and len(cleaned_words) > 2):
                continue
            if cleaned_words.lower() in stop:
                continue
            # Stems are stored as UTF-8 bytes, matching the bytes join below.
            s = sno.stem(cleaned_words.lower()).encode('utf8')
            filtered_sentence.append(s)
            if score == 1:
                all_positive_words.append(s)
            elif score == 0:
                all_negative_words.append(s)
    str1 = b" ".join(filtered_sentence)
    final_string.append(str1)
total_positive_words = len(all_positive_words)
total_negative_words = len(all_negative_words)
print(total_positive_words)
print(total_negative_words)
import random
from collections import Counter
# Draw up to 10000 words per class.  The size is capped at the population
# size because random.sample raises ValueError when asked for more.
apw = random.sample(all_positive_words, min(10000, len(all_positive_words)))
anw = random.sample(all_negative_words, min(10000, len(all_negative_words)))
# Counter tallies each sample in a single pass; calling list.count once per
# element rescans the whole sample every time.
freq_negative_words = Counter(anw)
freq_positive_words = Counter(apw)
# Let's see positive sentiment first: relative frequency of each sampled word.
# The counts come from the sample, so they are normalised by the sample's
# own total; dividing by the size of the full word list would shrink every
# probability by the sampling ratio.
sampled_positive_total = sum(freq_positive_words.values())
lst = [
    [key, freq / sampled_positive_total]
    for key, freq in freq_positive_words.items()
]
table_positive = pd.DataFrame(lst, columns=['positive_words','probability'])
table_positive = table_positive.sort_values('probability', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')
table_positive.head(20)
from operator import itemgetter
# Ten most frequent sampled positive words, most frequent first.
# Slicing to 10 replaces an `i>10` guard that let an eleventh word through.
posi = dict(
    sorted(freq_positive_words.items(), key=itemgetter(1), reverse=True)[:10]
)
posi
plt.bar(range(len(posi)), list(posi.values()), align='center')
# Keys may be UTF-8 bytes; decode them so the tick labels read as words.
plt.xticks(
    range(len(posi)),
    [k.decode('utf8') if isinstance(k, bytes) else k for k in posi],
)
print("Top 10 words that predicts positive sentiment")
plt.show()
# Now the negative sentiment words: relative frequency of each sampled word.
# The counts come from the sample, so they are normalised by the sample's
# own total; dividing by the size of the full word list would shrink every
# probability by the sampling ratio.
sampled_negative_total = sum(freq_negative_words.values())
lst = [
    [key, freq / sampled_negative_total]
    for key, freq in freq_negative_words.items()
]
table_negative = pd.DataFrame(lst, columns=['negative_words','probability'])
table_negative = table_negative.sort_values('probability', axis=0, ascending=False, inplace=False, kind='quicksort', na_position='last')
table_negative.head(20)
# Ten most frequent sampled negative words, most frequent first.
# Slicing to 10 replaces an `i>10` guard that let an eleventh word through.
nega = dict(
    sorted(freq_negative_words.items(), key=itemgetter(1), reverse=True)[:10]
)
nega
plt.bar(range(len(nega)), list(nega.values()), align='center')
# Keys may be UTF-8 bytes; decode them so the tick labels read as words.
plt.xticks(
    range(len(nega)),
    [k.decode('utf8') if isinstance(k, bytes) else k for k in nega],
)
print("Top 10 words that predicts negative sentiment")
plt.show()
Use the optimal hyperparameters found in step (e) to calculate the final accuracy.
optimal_alpha_bnb
# Fit the final naive Bayes model with the alpha selected by cross-validation.
# Passing the selected value (rather than a hard-coded 1) keeps the model in
# step with the search whenever the data or the alpha grid changes.
clf = MultinomialNB(alpha=float(optimal_alpha_bnb))
clf.fit(tf_idf_train,y_train)
y_pred_test = clf.predict(tf_idf_test)
from sklearn.metrics import accuracy_score
from collections import Counter
# Accuracy on the held-out test split, as a percentage.
acc = accuracy_score(y_test, y_pred_test, normalize=True) * float(100)
print('\n****Test accuracy is',(acc))
# Confusion matrix of the test predictions, shown as an annotated heatmap.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn import metrics
cm_test = confusion_matrix(y_test,y_pred_test)
cm_test
# Give the heatmap its own figure and show it explicitly; outside a notebook
# it would otherwise share axes with any later plot and never be displayed.
plt.figure()
sns.heatmap(cm_test,annot=True,fmt='d')
plt.show()
# Training accuracy and confusion matrix, for comparison with the test split.
import matplotlib.pyplot as plt
y_pred_train = clf.predict(tf_idf_train)
acc = accuracy_score(y_train, y_pred_train, normalize=True) * float(100)
# '%.2f' keeps the fractional part; '%d' truncated the percentage to an integer.
print('\n****Train accuracy is %.2f%%' % (acc))
cm_train = confusion_matrix(y_train,y_pred_train)
cm_train
# Separate figure so this heatmap is not drawn over an earlier one.
plt.figure()
sns.heatmap(cm_train,annot=True,fmt='d')
plt.show()